# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

  4. """
  5. RDFa parser.
  6.  
  7. RDFa is a set of attributes used to embed RDF in XHTML. An important goal of
  8. RDFa is to achieve this RDF embedding without repeating existing XHTML content
  9. when that content is the metadata.
  10.  
  11. REFERENCES:
  12.  
  13. \thttp://www.w3.org/2001/sw/BestPractices/HTML/2005-rdfa-syntax
  14.  
  15. LICENSE:
  16.  
  17.   BSD
  18.  
  19. CHANGE HISTORY:
  20.  
  21.   2006/06/03 - Initial Version
  22.   2006/06/08 - Added support for role (as per primer not syntax spec)
  23.                Added support for plaintext and flattening of XMLLiterals
  24.                ... (Sections 5.1.1.2 and 5.1.2.1)
  25.                Fixed plaintext bug where it was being resolved as CURIE
  26.                Added support to skip reserved @rel keywords from:
  27.                  http://www.w3.org/TR/REC-html40/types.html#h-6.12
  28.   2006/08/12 - Changed reserved @rel resolution to include a '#'
  29.                Fixed subject resolution for LINK/META when inside HEAD
  30.                Fixed blank node extraction [_:address] -> [_:_:address]
  31.                Added support for passing prefix mappings to the Graph
  32.                via RDFaSink
  33.                Added @id support as part of subject resolution
  34.  
  35. Copyright (c) 2006, Elias Torres <elias@torrez.us>
  36.  
  37. """
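
# Illustrative sketch (added annotation, assumptions not from the original
# module): given XHTML markup along the lines of
#
#   <div xmlns:dc="http://purl.org/dc/elements/1.1/"
#        about="http://example.org/doc" property="dc:title">Example</div>
#
# this parser would emit roughly the triple
#
#   (URIRef('http://example.org/doc'),
#    URIRef('http://purl.org/dc/elements/1.1/title'),
#    Literal('Example', datatype=rdf.XMLLiteral))
#
# i.e. the element's existing text content doubles as the metadata value.
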
import sys
import re
import urllib
import urlparse
import cStringIO
import string
from xml.dom import pulldom
from rdflib.syntax.parsers import Parser
from rdflib.Graph import ConjunctiveGraph
from rdflib import URIRef
from rdflib import BNode
from rdflib import Literal
from rdflib import Namespace

__version__ = '$Id: RDFaParser.py 1072 2007-03-30 18:12:54Z eliast $'

rdfa_attribs = ['about', 'property', 'rel', 'rev', 'href', 'content', 'role', 'id']

reserved_links = ['alternate', 'stylesheet', 'start', 'next', 'prev', 'contents',
                  'index', 'glossary', 'copyright', 'chapter', 'section',
                  'subsection', 'appendix', 'help', 'bookmark']

xhtml = Namespace('http://www.w3.org/1999/xhtml')
xml = Namespace('http://www.w3.org/XML/1998/namespace')
rdf = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')

class RDFaSink(object):

    def __init__(self, graph):
        self.graph = graph

    def __str__(self):
        return self.graph.serialize(format='pretty-xml')

    def triple(self, s, p, o):
        self.graph.add((s, p, o))

    def prefix(self, prefix, ns):
        self.graph.bind(prefix, ns, override=False)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')

def _urljoin(base, uri):
    uri = _urifixer.sub('\\1\\3', uri)
    return urlparse.urljoin(base, uri)

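# Illustrative examples for _urljoin above (added annotation, not from the
# original source): _urifixer collapses stray slashes right after the scheme
# before the standard join, e.g.
#   _urifixer.sub('\\1\\3', 'http:////example.org/a') -> 'http://example.org/a'
#   _urljoin('http://example.org/a/b', 'c')           -> 'http://example.org/a/c'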

class RDFaParser(Parser):

    def __init__(self):
        self.lang = None
        self.abouts = []
        self.xmlbases = []
        self.langs = []
        self.elementStack = [None]
        self.bcounter = {}
        self.bnodes = {}
        self.sink = None

    def parse(self, source, sink, baseURI=None):
        self.sink = RDFaSink(sink)
        self.triple = self.sink.triple
        self.prefix = self.sink.prefix
        self.baseuri = baseURI or source.getPublicId()
        f = source.getByteStream()
        events = pulldom.parse(f)
        self.handler = events.pulldom
        for (event, node) in events:
            if event == pulldom.START_DOCUMENT:
                self.abouts += [(URIRef(''), node)]

            if event == pulldom.END_DOCUMENT:
                assert len(self.elementStack) == 0

            if event == pulldom.START_ELEMENT:
                # keep track of the parent element
                self.elementStack += [node]
                # which RDFa attributes are present on this element?
                found = filter(lambda x: x in node.attributes.keys(), rdfa_attribs)

                # keep track of xml:base
                baseuri = node.getAttributeNS(xml, 'base') or node.getAttribute('xml:base') or self.baseuri
                self.baseuri = _urljoin(self.baseuri, baseuri)
                self.xmlbases.append(self.baseuri)

                # keep track of xml:lang
                if node.hasAttributeNS(xml, 'lang') or node.hasAttribute('xml:lang'):
                    lang = node.getAttributeNS(xml, 'lang') or node.getAttribute('xml:lang')
                    if lang == '':
                        # xml:lang="" explicitly clears the language
                        lang = None
                else:
                    lang = self.lang
                self.lang = lang
                self.langs.append(lang)

                # not an RDFa element; nothing further to extract
                if len(found) == 0:
                    continue

                parentNode = self.elementStack[-2]

                if 'about' in found:
                    self.abouts += [(self.extractCURIEorURI(node.getAttribute('about')), node)]
                elif 'id' in found:
                    self.abouts += [(self.extractCURIEorURI('#' + node.getAttribute('id')), node)]

                subject = self.abouts[-1][0]

                # subject resolution for meta/link elements
                if node.tagName == 'meta' or node.tagName == 'link':
                    if 'about' not in found and parentNode:
                        if parentNode.tagName == 'head':
                            subject = URIRef('')
                        elif parentNode.hasAttribute('about'):
                            subject = self.extractCURIEorURI(parentNode.getAttribute('about'))
                        elif parentNode.hasAttributeNS(xml, 'id') or parentNode.hasAttribute('id'):
                            id = parentNode.getAttributeNS(xml, 'id') or parentNode.getAttribute('id')
                            subject = self.extractCURIEorURI('#' + id)
                        else:
                            subject = self.generateBlankNode(parentNode)

                if 'property' in found:
                    predicate = self.extractCURIEorURI(node.getAttribute('property'))
                    literal = None
                    datatype = None
                    plaintext = False

                    if node.hasAttribute('datatype'):
                        sdt = node.getAttribute('datatype')
                        if sdt != 'plaintext':
                            datatype = self.extractCURIEorURI(sdt)
                        else:
                            plaintext = True

                    if node.hasAttribute('content'):
                        literal = Literal(node.getAttribute('content'), lang=lang, datatype=datatype)
                    else:
                        events.expandNode(node)
                        # node was expanded, so no END_ELEMENT will arrive for it
                        self._popStacks(event, node)
                        content = ''
                        for child in node.childNodes:
                            if datatype or plaintext:
                                content += self._getNodeText(child)
                            else:
                                content += child.toxml()
                        content = content.strip()
                        literal = Literal(content, datatype=datatype or rdf.XMLLiteral)

                    if literal:
                        self.triple(subject, predicate, literal)

                if 'rel' in found:
                    rel = node.getAttribute('rel').strip()
                    # reserved @rel keywords are mapped into the XHTML namespace
                    if string.lower(rel) in reserved_links:
                        rel = xhtml['#' + string.lower(rel)]
                    predicate = self.extractCURIEorURI(rel)
                    if node.hasAttribute('href'):
                        object = self.extractCURIEorURI(node.getAttribute('href'))
                        self.triple(subject, predicate, object)

                if 'rev' in found:
                    predicate = self.extractCURIEorURI(node.getAttribute('rev'))
                    if node.hasAttribute('href'):
                        object = self.extractCURIEorURI(node.getAttribute('href'))
                        self.triple(object, predicate, subject)

                if 'role' in found:
                    type = self.extractCURIEorURI(node.getAttribute('role'))
                    self.triple(subject, rdf.type, type)

            if event == pulldom.END_ELEMENT:
                self._popStacks(event, node)

        # pass any prefix mappings seen by the pull parser on to the sink
        for nsc in self.handler._ns_contexts:
            for ns, prefix in nsc.items():
                self.prefix(prefix, ns)

        f.close()

    def _getNodeText(self, node):
        # nodeType 3 is TEXT_NODE, 4 is CDATA_SECTION_NODE
        if node.nodeType in (3, 4):
            return node.nodeValue
        text = ''
        for child in node.childNodes:
            if child.nodeType in (3, 4):
                text = text + child.nodeValue
        return text

    def generateBlankNode(self, parentNode):
        name = parentNode.tagName
        if self.bnodes.has_key(parentNode):
            return self.bnodes[parentNode]
        if self.bcounter.has_key(name):
            self.bcounter[name] = self.bcounter[name] + 1
        else:
            self.bcounter[name] = 0
        self.bnodes[parentNode] = BNode('%s%d' % (name, self.bcounter[name]))
        return self.bnodes[parentNode]

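    # Illustrative examples for generateBlankNode above (added annotation, not
    # from the original source): the first anonymous <div> parent yields
    # BNode('div0'), a second distinct <div> yields BNode('div1'), and asking
    # again for a node that was already seen returns its cached BNode.
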
    def extractCURIEorURI(self, resource):
        # strip an optional surrounding pair of square brackets
        if len(resource) > 0 and resource[0] == '[' and resource[-1] == ']':
            resource = resource[1:-1]

        # resolve the prefix against the namespaces seen by the pull parser
        if resource.find(':') > -1:
            (rpre, rsuf) = resource.split(':', 1)
            for nsc in self.handler._ns_contexts:
                for ns, prefix in nsc.items():
                    if prefix == rpre:
                        resource = ns + rsuf

        # is this a blank node?
        if len(resource) > 0 and resource[0:2] == '_:':
            return BNode(resource[2:])
        return URIRef(self.resolveURI(resource))

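    # Illustrative examples for extractCURIEorURI above (added annotation, not
    # from the original source): assuming xmlns:dc="http://purl.org/dc/elements/1.1/"
    # is declared in the document, 'dc:title' resolves to
    # URIRef('http://purl.org/dc/elements/1.1/title'); '[_:address]' becomes
    # BNode('address'); anything else is resolved against the current base URI.
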
    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def _popStacks(self, event, node):
        # drop the current about if it was set on this node
        if len(self.abouts) != 0:
            (about, aboutnode) = self.abouts[-1]
            if aboutnode == node:
                self.abouts.pop()

        # this element is going out of scope
        self.elementStack.pop()

        # restore the enclosing xml:base and xml:lang
        if self.xmlbases:
            self.xmlbases.pop()
            if self.xmlbases and self.xmlbases[-1]:
                self.baseuri = self.xmlbases[-1]

        if self.langs:
            self.langs.pop()
            if self.langs and self.langs[-1]:
                self.lang = self.langs[-1]


if __name__ == '__main__':
    store = ConjunctiveGraph()
    store.load(sys.argv[1], format='rdfa')
    print store.serialize(format='pretty-xml')
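
# Usage sketch (added note, not from the original source): run as a script with
# an XHTML+RDFa document as its argument, e.g.
#   python RDFaParser.py page.xhtml
# and the triples extracted from the RDFa attributes are printed as RDF/XML.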